from nltk_manager import NLTKManager
import pandas as pd
from textblob import TextBlob
import re


class DataLoader:
    """Load a CSV file and clean its ``SentimentText`` column.

    On construction the CSV at ``path`` is read into ``self.df`` and two
    feature columns are added from the raw text:

    * ``word_count`` -- number of whitespace-separated tokens.
    * ``avg_word``   -- mean token length (see :meth:`avg_word`).

    ``SentimentText`` is then rewritten in place: lower-cased with whitespace
    collapsed, punctuation stripped, passed through ``TextBlob(...).correct()``,
    filtered against the English stop words supplied by ``NLTKManager``, and
    finally stripped of the ten least frequent tokens of the whole column.

    Indexing (``loader[...]``) and unknown attribute access are forwarded to
    the underlying DataFrame.
    """

    _TEXT_COLUMN = 'SentimentText'

    def __init__(self, path: str):
        """Read ``path`` with ``pandas.read_csv`` and run the cleaning pipeline.

        Args:
            path: Location of a CSV file that has a ``SentimentText`` column.
        """
        self.nltk_manager = NLTKManager('../.venv/nltk_data')
        # A frozenset gives O(1) membership tests in the per-token filter below.
        # Assumes get_stopwords returns an iterable of strings -- confirm
        # against NLTKManager.
        stop = frozenset(self.nltk_manager.get_stopwords('english'))
        self.path = path
        self.df = pd.read_csv(path)

        column = self._TEXT_COLUMN
        # Empty CSV cells are read as NaN floats; coerce them to '' so the
        # string operations below do not raise on missing rows.
        text = self.df[column].fillna('').astype(str)

        # Features are computed on the raw text, before any cleaning.
        # split() (not split(' ')) so repeated/leading spaces do not count as
        # words, matching the tokenisation used by avg_word.
        self.df['word_count'] = text.apply(lambda sen: len(sen.split()))
        self.df['avg_word'] = text.apply(self.avg_word)

        # Lower-case and collapse runs of whitespace to single spaces.
        text = text.apply(lambda sen: ' '.join(sen.lower().split()))
        # regex=True is required: since pandas 2.0 str.replace treats the
        # pattern as a literal string by default, which would leave all
        # punctuation in place.
        text = text.str.replace(r'[^\w\s]', '', regex=True)
        text = text.apply(lambda sen: str(TextBlob(sen).correct()))
        text = text.apply(
            lambda sen: ' '.join(word for word in sen.split() if word not in stop))

        # value_counts() sorts by descending frequency, so the tail holds the
        # rarest tokens; drop those from every row.
        counts = pd.Series(' '.join(text).split()).value_counts()
        rare = frozenset(counts.tail(10).index)
        self.df[column] = text.apply(
            lambda sen: ' '.join(word for word in sen.split() if word not in rare))

    def __getitem__(self, key):
        """Index into the underlying DataFrame.

        * tuple containing a slice -> positional ``df.iloc[key]``
        * tuple without slices     -> column selection ``df[list(key)]``
        * ``str``                  -> single column ``df[key]``
        * anything else            -> positional ``df.iloc[key]``
        """
        if isinstance(key, tuple):
            if any(isinstance(part, slice) for part in key):
                return self.df.iloc[key]
            return self.df[list(key)]
        if isinstance(key, str):
            return self.df[key]
        return self.df.iloc[key]

    def __getattr__(self, item):
        """Forward attributes not found on the loader to ``self.df``.

        Raises:
            AttributeError: if neither the loader nor the DataFrame has ``item``.
        """
        # __getattr__ only runs after normal lookup has failed. If 'df' itself
        # is the missing name (instance not fully initialised, e.g. during
        # copy/unpickle), touching self.df below would recurse forever.
        if item == 'df':
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{item}'")
        try:
            return getattr(self.df, item)
        except AttributeError:
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{item}'") from None

    def avg_word(self, sentence):
        """Return the mean length of the whitespace-separated words in ``sentence``.

        Returns ``0.0`` when the sentence contains no words (empty or
        whitespace-only) instead of dividing by zero.
        """
        words = sentence.split()
        if not words:
            return 0.0
        return sum(len(word) for word in words) / len(words)


if __name__ == '__main__':
    # Manual smoke run: load the bundled CSV, run the cleaning pipeline in
    # DataLoader.__init__, and show the first rows of the cleaned text column.
    x_sentiment = DataLoader('./data/twitter_data.csv')
    cleaned_preview = x_sentiment['SentimentText'].head()
    print(cleaned_preview)
